GroupBy¶

In [1]:
import pandas as pd

# create a dataframe of car models by two companies
df = pd.DataFrame({
    'Company': ['A', 'A', 'A', 'B', 'B', 'B', 'B'],
    'Model': ['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'B4'],
    'Year': [2019, 2020, 2021, 2018, 2019, 2020, 2021],
    'Transmission': ['Manual', 'Automatic', 'Automatic', 
            'Manual', 'Automatic', 'Automatic', 'Manual'],
    'EngineSize': [1.4, 2.0, 1.4, 1.5, 2.0, 1.5, 1.5],
    'MPG': [55.4, 67.3, 58.9, 52.3, 64.2, 68.9, 83.1]
})
# display the dataframe
df
Out[1]:
Company Model Year Transmission EngineSize MPG
0 A A1 2019 Manual 1.4 55.4
1 A A2 2020 Automatic 2.0 67.3
2 A A3 2021 Automatic 1.4 58.9
3 B B1 2018 Manual 1.5 52.3
4 B B2 2019 Automatic 2.0 64.2
5 B B3 2020 Automatic 1.5 68.9
6 B B4 2021 Manual 1.5 83.1
In [4]:
# groupby columns on Col1 and estimate the 
# maximum value of column Col2 for each group
# df.groupby([Col1])[Col2].max()
df.groupby(["Company"])["MPG"].max()
Out[4]:
Company
A    67.3
B    83.1
Name: MPG, dtype: float64
In [5]:
# alternatively, you can pass 'max' to the agg() function
df.groupby(["Company"])["MPG"].agg('max')
Out[5]:
Company
A    67.3
B    83.1
Name: MPG, dtype: float64
In [10]:
df.groupby(["Company"])["MPG"].agg(['max','mean','count','std'])
Out[10]:
max mean count std
Company
A 67.3 60.533333 3 6.115826
B 83.1 67.125000 4 12.736921
C 70.0 70.000000 1 NaN

Add new row¶

Use pd.concat([df, new_df], ignore_index=True) to add new row of data.¶

In [7]:
# DO NOT USE df.append()
df.append(pd.DataFrame({
    "Company":["C"],
    "Model":["C1"],
    "Year":[2023],
    "Transmission":["Automatic"],
    "EngineSize":[1.8] ,
    "MPG":[70],
}))
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[7], line 2
      1 # DO NOT USE df.append()
----> 2 df.append(pd.DataFrame({
      3     "Company":["C"],
      4     "Model":["C1"],
      5     "Year":[2023],
      6     "Transmission":["Automatic"],
      7     "EngineSize":[1.8] ,
      8     "MPG":[70],
      9 }))

File ~\anaconda3\Lib\site-packages\pandas\core\generic.py:6204, in NDFrame.__getattr__(self, name)
   6197 if (
   6198     name not in self._internal_names_set
   6199     and name not in self._metadata
   6200     and name not in self._accessors
   6201     and self._info_axis._can_hold_identifiers_and_holds_name(name)
   6202 ):
   6203     return self[name]
-> 6204 return object.__getattribute__(self, name)

AttributeError: 'DataFrame' object has no attribute 'append'
In [8]:
df = pd.concat([df, pd.DataFrame({
                                "Company":["C"],
                                "Model":["C1"],
                                "Year":[2023],
                                "Transmission":["Automatic"],
                                "EngineSize":[1.8] ,
                                "MPG":[70],
                                })
                ], ignore_index=True)
In [9]:
df
Out[9]:
Company Model Year Transmission EngineSize MPG
0 A A1 2019 Manual 1.4 55.4
1 A A2 2020 Automatic 2.0 67.3
2 A A3 2021 Automatic 1.4 58.9
3 B B1 2018 Manual 1.5 52.3
4 B B2 2019 Automatic 2.0 64.2
5 B B3 2020 Automatic 1.5 68.9
6 B B4 2021 Manual 1.5 83.1
7 C C1 2023 Automatic 1.8 70.0

Access a row¶

In [8]:
df.iloc[5]
Out[8]:
Company                 B
Model                  B3
Year                 2020
Transmission    Automatic
EngineSize            1.5
MPG                  68.9
Name: 5, dtype: object

image.png

df.sort_values¶

In [11]:
df.sort_values(by=["Year","MPG"], ascending=[True,False])
Out[11]:
Company Model Year Transmission EngineSize MPG
3 B B1 2018 Manual 1.5 52.3
4 B B2 2019 Automatic 2.0 64.2
0 A A1 2019 Manual 1.4 55.4
5 B B3 2020 Automatic 1.5 68.9
1 A A2 2020 Automatic 2.0 67.3
6 B B4 2021 Manual 1.5 83.1
2 A A3 2021 Automatic 1.4 58.9
7 C C1 2023 Automatic 1.8 70.0
In [10]:
df1 = df.sort_values(by="Year", ascending=True)
df1
Out[10]:
Company Model Year Transmission EngineSize MPG
3 B B1 2018 Manual 1.5 52.3
0 A A1 2019 Manual 1.4 55.4
4 B B2 2019 Automatic 2.0 64.2
1 A A2 2020 Automatic 2.0 67.3
5 B B3 2020 Automatic 1.5 68.9
2 A A3 2021 Automatic 1.4 58.9
6 B B4 2021 Manual 1.5 83.1
7 C C1 2023 Automatic 1.8 70.0
In [12]:
df1 = df.sort_values(by=["Company","Year"], ascending=[True,False])
df1
Out[12]:
Company Model Year Transmission EngineSize MPG
2 A A3 2021 Automatic 1.4 58.9
1 A A2 2020 Automatic 2.0 67.3
0 A A1 2019 Manual 1.4 55.4
6 B B4 2021 Manual 1.5 83.1
5 B B3 2020 Automatic 1.5 68.9
4 B B2 2019 Automatic 2.0 64.2
3 B B1 2018 Manual 1.5 52.3
7 C C1 2023 Automatic 1.8 70.0

df.set_index¶

In [13]:
df.set_index('Model')
Out[13]:
Company Year Transmission EngineSize MPG
Model
A1 A 2019 Manual 1.4 55.4
A2 A 2020 Automatic 2.0 67.3
A3 A 2021 Automatic 1.4 58.9
B1 B 2018 Manual 1.5 52.3
B2 B 2019 Automatic 2.0 64.2
B3 B 2020 Automatic 1.5 68.9
B4 B 2021 Manual 1.5 83.1
C1 C 2023 Automatic 1.8 70.0

df.reset_index¶

In [15]:
df1.reset_index(drop=True)
Out[15]:
Company Model Year Transmission EngineSize MPG
0 A A3 2021 Automatic 1.4 58.9
1 A A2 2020 Automatic 2.0 67.3
2 A A1 2019 Manual 1.4 55.4
3 B B4 2021 Manual 1.5 83.1
4 B B3 2020 Automatic 1.5 68.9
5 B B2 2019 Automatic 2.0 64.2
6 B B1 2018 Manual 1.5 52.3
7 C C1 2023 Automatic 1.8 70.0

Alter column/series order¶

In [14]:
df
Out[14]:
Company Model Year Transmission EngineSize MPG
0 A A1 2019 Manual 1.4 55.4
1 A A2 2020 Automatic 2.0 67.3
2 A A3 2021 Automatic 1.4 58.9
3 B B1 2018 Manual 1.5 52.3
4 B B2 2019 Automatic 2.0 64.2
5 B B3 2020 Automatic 1.5 68.9
6 B B4 2021 Manual 1.5 83.1
7 C C1 2023 Automatic 1.8 70.0
In [15]:
df[["MPG","EngineSize","Transmission","Company","Model","Year"]]
Out[15]:
MPG EngineSize Transmission Company Model Year
0 55.4 1.4 Manual A A1 2019
1 67.3 2.0 Automatic A A2 2020
2 58.9 1.4 Automatic A A3 2021
3 52.3 1.5 Manual B B1 2018
4 64.2 2.0 Automatic B B2 2019
5 68.9 1.5 Automatic B B3 2020
6 83.1 1.5 Manual B B4 2021
7 70.0 1.8 Automatic C C1 2023
In [16]:
df.iloc[:,[5,4,3,0,1,2]]
Out[16]:
MPG EngineSize Transmission Company Model Year
0 55.4 1.4 Manual A A1 2019
1 67.3 2.0 Automatic A A2 2020
2 58.9 1.4 Automatic A A3 2021
3 52.3 1.5 Manual B B1 2018
4 64.2 2.0 Automatic B B2 2019
5 68.9 1.5 Automatic B B3 2020
6 83.1 1.5 Manual B B4 2021
7 70.0 1.8 Automatic C C1 2023

Series.unique¶

In [17]:
df["EngineSize"].unique()
Out[17]:
array([1.4, 2. , 1.5, 1.8])
In [27]:
df["Company"].unique().tolist()
Out[27]:
['A', 'B', 'C']
In [19]:
df["Year"].unique()
Out[19]:
array([2019, 2020, 2021, 2018, 2023])

Series.nunique¶

In [20]:
df["EngineSize"].nunique()
Out[20]:
4
In [21]:
df["Year"].nunique()
Out[21]:
5
In [22]:
df["Transmission"].nunique()
Out[22]:
2

Return a Series containing counts of unique values.¶

In [20]:
df["Year"].value_counts()
Out[20]:
Year
2019    2
2020    2
2021    2
2018    1
2023    1
Name: count, dtype: int64

pd.read_json¶

In [47]:
link = "https://api.data.gov.hk/v2/filter?q=%7B%22resource%22%3A%22http%3A%2F%2Fwww.rvd.gov.hk%2Fdoc%2Fen%2Fstatistics%2Fhis_data_16.xls%22%2C%22section%22%3A1%2C%22format%22%3A%22json%22%7D"
data = pd.read_json(link)
In [48]:
data.columns
Out[48]:
Index(['年 Year', '月 Month', '一手買賣 數目 Primary Sales No.',
       '一手買賣 總值 (百萬元) Primary Sales Consideration ($ million)',
       '二手買賣 數目 Secondary Sales No.',
       '二手買賣 總值 (百萬元) Secondary Sales Consideration ($ million)'],
      dtype='object')
In [53]:
# change the column name
data = data.rename(columns = {'年 Year':'Year',
                              '月 Month':'Month',
                              '一手買賣 數目 Primary Sales No.':'FirstHandSales',
                              '一手買賣 總值 (百萬元) Primary Sales Consideration ($ million)':
                              'FirstConsideration',
                              '二手買賣 數目 Secondary Sales No.':'SecondHandSales',
                              '二手買賣 總值 (百萬元) Secondary Sales Consideration ($ million)':
                              'SecondConsideration'
    })
In [56]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Year                 266 non-null    int64
 1   Month                266 non-null    int64
 2   FirstHandSales       266 non-null    int64
 3   FirstConsideration   266 non-null    int64
 4   SecondHandSales      266 non-null    int64
 5   SecondConsideration  266 non-null    int64
dtypes: int64(6)
memory usage: 12.6 KB
In [57]:
# join Year and Month as new column
data["DateTime"] = data["Year"].astype(str) + "-" + data["Month"].astype(str)
# drop Year and Month
data = data.drop(['Year','Month'], axis=1)
# set DateTime as index
data = data.set_index('DateTime')
data
Out[57]:
FirstHandSales FirstConsideration SecondHandSales SecondConsideration
DateTime
2002-1 2718 8218 5129 9966
2002-2 1506 4305 4189 8070
2002-3 1709 4190 3741 6795
2002-4 2543 6516 4866 8739
2002-5 2257 5844 5068 11097
... ... ... ... ...
2023-10 356 4805 1767 19693
2023-11 547 6318 2007 13352
2023-12 959 9684 1970 14640
2024-1 1003 8336 2474 19456
2024-2 367 5677 2008 13419

266 rows × 4 columns

In [58]:
data.columns
Out[58]:
Index(['FirstHandSales', 'FirstConsideration', 'SecondHandSales',
       'SecondConsideration'],
      dtype='object')
In [60]:
# pip install plotly
Requirement already satisfied: plotly in c:\users\user\anaconda3\lib\site-packages (5.19.0)
Requirement already satisfied: tenacity>=6.2.0 in c:\users\user\anaconda3\lib\site-packages (from plotly) (8.2.2)
Requirement already satisfied: packaging in c:\users\user\anaconda3\lib\site-packages (from plotly) (23.1)
Note: you may need to restart the kernel to use updated packages.
In [59]:
import plotly.express as px
fig = px.line(data, x=data.index, y=["FirstHandSales","FirstConsideration"], 
              title='First Hand Sales')
fig.show()

image.png

In [66]:
data.corr()
Out[66]:
FirstHandSales FirstConsideration SecondHandSales SecondConsideration
FirstHandSales 1.000000 0.540893 0.131269 0.046023
FirstConsideration 0.540893 1.000000 -0.014293 0.554104
SecondHandSales 0.131269 -0.014293 1.000000 0.491342
SecondConsideration 0.046023 0.554104 0.491342 1.000000
In [67]:
data['FirstHandSales'].corr(data['SecondHandSales'])
Out[67]:
0.13126927888869652
In [64]:
fig = px.imshow(data, text_auto=True)
fig.show()

image.png

In [70]:
data#.T.T
Out[70]:
FirstHandSales FirstConsideration SecondHandSales SecondConsideration
DateTime
2002-1 2718 8218 5129 9966
2002-2 1506 4305 4189 8070
2002-3 1709 4190 3741 6795
2002-4 2543 6516 4866 8739
2002-5 2257 5844 5068 11097
... ... ... ... ...
2023-10 356 4805 1767 19693
2023-11 547 6318 2007 13352
2023-12 959 9684 1970 14640
2024-1 1003 8336 2474 19456
2024-2 367 5677 2008 13419

266 rows × 4 columns

image.png

In [75]:
data.to_numpy()
Out[75]:
array([[ 2718,  8218,  5129,  9966],
       [ 1506,  4305,  4189,  8070],
       [ 1709,  4190,  3741,  6795],
       ...,
       [  959,  9684,  1970, 14640],
       [ 1003,  8336,  2474, 19456],
       [  367,  5677,  2008, 13419]], dtype=int64)
In [77]:
data["FirstHandSales"].to_numpy()
Out[77]:
array([2718, 1506, 1709, 2543, 2257, 2087,  755,  796, 3278, 2282, 1340,
       1817, 2291,  966, 1655, 2431, 1257, 1602, 2765, 2766, 1867, 3958,
       2602, 2338, 2894, 3734, 3705, 1379,  961, 2100, 2421, 1099, 1558,
       1801, 2576, 1466, 1056, 2092,  993,  918, 2471, 2028, 1150,  569,
        524, 2762, 1228,  203,  148,  251,  895,  781, 1137, 1105,  552,
       1421, 3929, 1165,  953, 1649,  994, 1267,  834, 2410, 2293, 1173,
       1402, 1886, 2434, 1901, 2573,  956, 1702,  690, 2399,  540,  638,
       2058,  457,  216,  917,  403,  131,  895,  507,  232,  863, 1587,
       2525, 2717, 1307, 2301, 1532,  627,  753, 1210,  975,  564, 2028,
       1975,  685, 1318, 1137, 1531,  556, 1656,  747,  474,  296,  486,
       1123,  741, 2054, 1050,  608,  884,  231,  822, 1189, 1396,  879,
        581, 1435,  804, 1092,  827, 1670, 1397,  644, 2264, 1066,  309,
        632, 1197, 1095, 1045, 1328,  135,  211,  546,  871,  773, 1151,
       2062, 1760, 1142,  693, 1136,  841, 1375, 2507, 1606, 1182, 1723,
       1119, 1773, 1465, 1540,  738, 1634, 1808, 1547, 1122,  764, 1602,
       1404, 1049, 2153,  450,  234,  641, 1294, 1476, 1538, 1142, 1628,
       3474, 2251, 2212,  453,  664, 1474, 1435, 2650, 1275, 2320,  952,
       1236, 2389, 1541, 1416, 1293,  828, 1168,  738, 2122,  756, 2070,
       1753, 1490, 1252, 2337,  720,  399, 2191, 1654, 1468, 2789, 3246,
       1191, 1810, 1252, 1338, 1387, 2112,  670,  626, 1016,  619,  775,
       1179, 2195, 1673, 1068,  808, 1046, 2201, 2111,  586, 1553, 1553,
       1067, 1572, 1954, 2052,  708, 1926, 1297, 1868, 1514, 1081,  481,
        161,  258, 1492, 1382, 1177, 1564, 1435,  672,  365,  247,  363,
        655, 1787, 1550, 1000, 1019,  810,  742,  964,  356,  547,  959,
       1003,  367], dtype=int64)

Time Series¶

In [79]:
tnx = pd.read_csv("^TNX.csv")
tnx.tail(5)
Out[79]:
Date Open High Low Close Adj Close Volume
257 2024-07-24 4.229 4.292 4.210 4.286 4.286 0.0
258 2024-07-25 4.206 4.266 4.196 4.256 4.256 0.0
259 2024-07-26 4.245 4.252 4.192 4.200 4.200 0.0
260 2024-07-29 4.163 4.186 4.151 4.178 4.178 0.0
261 2024-07-30 4.174 4.184 4.130 4.143 4.143 0.0
In [80]:
tnx.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       262 non-null    object 
 1   Open       252 non-null    float64
 2   High       252 non-null    float64
 3   Low        252 non-null    float64
 4   Close      252 non-null    float64
 5   Adj Close  252 non-null    float64
 6   Volume     252 non-null    float64
dtypes: float64(6), object(1)
memory usage: 14.5+ KB
In [82]:
tnx['Date'] = pd.to_datetime(tnx['Date'],format="%Y-%m-%d")
tnx.Date
Out[82]:
0     2023-07-31
1     2023-08-01
2     2023-08-02
3     2023-08-03
4     2023-08-04
         ...    
257   2024-07-24
258   2024-07-25
259   2024-07-26
260   2024-07-29
261   2024-07-30
Name: Date, Length: 262, dtype: datetime64[ns]
In [83]:
tnx = tnx.set_index('Date') 
tnx.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 262 entries, 2023-07-31 to 2024-07-30
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       252 non-null    float64
 1   High       252 non-null    float64
 2   Low        252 non-null    float64
 3   Close      252 non-null    float64
 4   Adj Close  252 non-null    float64
 5   Volume     252 non-null    float64
dtypes: float64(6)
memory usage: 14.3 KB
In [84]:
tnx.tail(3)
Out[84]:
Open High Low Close Adj Close Volume
Date
2024-07-26 4.245 4.252 4.192 4.200 4.200 0.0
2024-07-29 4.163 4.186 4.151 4.178 4.178 0.0
2024-07-30 4.174 4.184 4.130 4.143 4.143 0.0
In [85]:
tnx.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 262 entries, 2023-07-31 to 2024-07-30
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       252 non-null    float64
 1   High       252 non-null    float64
 2   Low        252 non-null    float64
 3   Close      252 non-null    float64
 4   Adj Close  252 non-null    float64
 5   Volume     252 non-null    float64
dtypes: float64(6)
memory usage: 14.3 KB
In [106]:
df_dt = pd.DataFrame({'year': [2023, 2023],
                   'month': [1, 1],
                   'day': [1, 2],
                   'sales': [3750, 3900]})
df_dt
Out[106]:
year month day sales
0 2023 1 1 3750
1 2023 1 2 3900
In [107]:
df_dt['date'] = pd.to_datetime(df_dt[['year','month','day']])
df_dt = df_dt.drop(['year','month','day'], axis=1)
df_dt = df_dt.set_index('date')
df_dt
Out[107]:
sales
date
2023-01-01 3750
2023-01-02 3900
In [111]:
df_dt
Out[111]:
sales
date
2023-01-01 3750
2023-01-02 3900
In [112]:
df_dt.iloc[0] 
Out[112]:
sales    3750
Name: 2023-01-01 00:00:00, dtype: int64
In [40]:
df_dt.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2 entries, 2023-01-01 to 2023-01-02
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   sales   2 non-null      int64
dtypes: int64(1)
memory usage: 32.0 bytes
In [41]:
pd.to_datetime(1681038343, unit='s')
Out[41]:
Timestamp('2023-04-09 11:05:43')
In [42]:
pd.to_datetime(1681038343433502912, unit='ns')
Out[42]:
Timestamp('2023-04-09 11:05:43.433502912')

Categorical data¶

In [2]:
animal = pd.Series(["Bird","Cat", "Dog","Elephant"], dtype="category")
In [3]:
animal.info()
<class 'pandas.core.series.Series'>
RangeIndex: 4 entries, 0 to 3
Series name: None
Non-Null Count  Dtype   
--------------  -----   
4 non-null      category
dtypes: category(1)
memory usage: 340.0 bytes
In [4]:
import numpy as np
df_exam = pd.DataFrame({'Score': np.random.randint(1, 101, size=40)})
In [5]:
df_exam.head(6)
Out[5]:
Score
0 40
1 22
2 27
3 78
4 61
5 20
In [6]:
df_exam['Result'] = pd.Series(np.where(df_exam['Score']>70, 'Distiction',
                         np.where(df_exam['Score']>50, 'Pass', 'Fail') 
                    )).astype('category') 
In [9]:
df_exam.sample(5)
Out[9]:
Score Result
7 95 Distiction
0 40 Fail
27 70 Pass
22 38 Fail
38 85 Distiction
In [8]:
df_exam.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   Score   40 non-null     int32   
 1   Result  40 non-null     category
dtypes: category(1), int32(1)
memory usage: 464.0 bytes
In [ ]:
 
In [ ]: